Cluster - Aufgabe 2 Skript

Einlesen der Daten und Darstellung. Da der Datensatz nur 3 Variablen hat, bietet sich eine 3D-Darstellung an. Aber auch andere Darstellungen sind möglich!

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the example data: read_csv2() handles ';' as delimiter and ',' as
# decimal mark; the file yields 100 rows with columns ...1 (row id), Z1, Z2, Z3
cluster.dat <- read_csv2( "cluster_example.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## New names:Rows: 100 Columns: 4── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (4): ...1, Z1, Z2, Z3
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 3D visualization
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive 3D scatter plot of the raw (still unclustered) observations
plot_ly(
  cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3) %>%
  add_markers()

Der EM-Algorithmus ist im Paket mclust implementiert. Bei Clusterverfahren muss man sich Gedanken zur Wahl der Anzahl an Clustern machen, z.B. mittels Betrachtung von Informationskriterien wie dem BIC.

# EM algorithm
library(mclust)
## Package 'mclust' version 6.0.1
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
# Number of clusters: evaluate BIC for G = 1..10 mixture components under
# the covariance models "EII" (spherical) and "EEI" (diagonal), then plot
BIC <- mclustBIC(cluster.dat, G=1:10, model = c("EII", "EEI"))
plot(BIC)

Der in diesem Paket implementierte Algorithmus verwendet zur Bestimmung der Startwerte hierarchisches Clustern und trifft darüber auch die Wahl der Anzahl der Cluster.

# Clustering - hierarchical clustering is used internally for initialization
# Model selection here picks 2 clusters (see summary below)
mc <- Mclust (cluster.dat)
# Result: best model is VVE with 2 components, cluster sizes 46 and 54
summary ( mc )
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 2 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -1540.838 100 23 -3187.595 -3191.677
## 
## Clustering table:
##  1  2 
## 46 54

Die ersten 6 Zeilen der Matrix mit den Wahrscheinlichkeiten, dass eine Beobachtung entweder zu Cluster 1 oder zu Cluster 2 gehört.

# First 6 rows of the posterior membership matrix mc$z:
# per observation, column 1 = P(cluster 1), column 2 = P(cluster 2)
head(mc$z)
##           [,1]         [,2]
## [1,] 0.9999998 1.506185e-07
## [2,] 0.9998465 1.534586e-04
## [3,] 1.0000000 4.434705e-11
## [4,] 0.9993320 6.680474e-04
## [5,] 0.9993203 6.796786e-04
## [6,] 1.0000000 1.470275e-09

Darstellung mit der Farbe gemäß der Klasse, die die größte Wahrscheinlichkeit hat.

# 3D plot colored by the most probable cluster (MAP classification)
plot_ly(
  cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3,
  color = mc$classification) %>%
  add_markers()
# Try a forced solution with 3 cluster classes (G=3):
# one class is more strongly populated than the other two classes
mc.3 <- Mclust (cluster.dat, G=3)
summary(mc.3)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVE (ellipsoidal, equal orientation) model with 3 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -1531.627 100 32 -3210.619 -3218.583
## 
## Clustering table:
##  1  2  3 
## 26 20 54
# 3D plot colored by the assigned clusters of the 3-component model
plot_ly(
  cluster.dat, x = ~Z1, y = ~Z2, z = ~Z3,
  color = mc.3$classification) %>%
  add_markers()

Bayessche Netze

Weitergehende Infos zum R Paket gRain z.B. in der Publikation https://www.jstatsoft.org/article/view/v046i10

Hinweise zur Installation: https://people.math.aau.dk/~sorenh/software/gR/

Quelle der Beispiele: http://www.di.fc.ul.pt/~jpn/r/bayesnets/bayesnets.html (Abruf 17.12.2020)

# One-time setup: install the Bioconductor dependencies needed by the gR
# packages (graph, RBGL, Rgraphviz), then gRbase/gRain/gRim from CRAN
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(version = "3.18")
BiocManager::install(c("graph", "RBGL", "Rgraphviz"))
install.packages("gRbase", dependencies=TRUE); 
install.packages("gRain", dependencies=TRUE); 
install.packages("gRim", dependencies=TRUE)

Aufgabe Krebsdiagnostik

Krebsdiagnostik
Krebsdiagnostik
library(gRain)
library(gRbase)
library(gRim)
#library(Rgraphviz)
# the possible values (all nodes are boolean vars)
yn <- c("yes","no")

# Specify the CPTs. cptable() normalizes the values within each parent
# configuration, e.g. c(1, 99) becomes P(C=yes)=0.01, P(C=no)=0.99, and
# c(9,1,2,8) becomes P(T|C=yes)=(0.9,0.1), P(T|C=no)=(0.2,0.8) — see the
# printed tables below
node.C <- cptable(~ C, values=c(1, 99), levels=yn)
node.T1 <- cptable(~ T1 + C, values=c(9,1,2,8), levels=yn)
node.T2 <- cptable(~ T2 + C, values=c(9,1,2,8), levels=yn)

# create an intermediate representation of the CPTs
plist <- compileCPT(list(node.C, node.T1, node.T2))
plist
##  P( C )
##  P( T1 | C )
##  P( T2 | C )
plist$C
## C
##  yes   no 
## 0.01 0.99
plist$T1
##      C
## T1    yes  no
##   yes 0.9 0.2
##   no  0.1 0.8
# Create (and compile) the Bayesian network from the CPT list
bn.cancer <- grain(plist)
summary(bn.cancer)
## Independence network: Compiled: TRUE Propagated: FALSE 
##  Nodes : chr [1:3] "C" "T1" "T2"
##  Number of cliques:                 2 
##  Maximal clique size:               2 
##  Maximal state space in cliques:    4
# The marginal probability for each variable (no evidence set yet):
querygrain(bn.cancer, nodes=c("C", "T1", "T2"), type="marginal")
## $C
## C
##  yes   no 
## 0.01 0.99 
## 
## $T1
## T1
##   yes    no 
## 0.207 0.793 
## 
## $T2
## T2
##   yes    no 
## 0.207 0.793
# The joint probability P(C,T1):
querygrain(bn.cancer, nodes=c("C","T1"), type="joint")
##      T1
## C       yes    no
##   yes 0.009 0.001
##   no  0.198 0.792
# P(T1=yes | T2=yes):
#  1. add evidence to the net
bn.cancer.1 <- setFinding(bn.cancer, nodes=c("T2"), states=c("yes")) 
#  2. query the new net
querygrain(bn.cancer.1, nodes=c("T1"))
## $T1
## T1
##       yes        no 
## 0.2304348 0.7695652
# The probability of this evidence:
# print(getFinding(bn.cancer.1))
# The conditional P(C | T1=no) — in particular P(C=no | T1=no)
bn.cancer.2 <- setFinding(bn.cancer, nodes=c("T1"), states=c("no")) 
querygrain(bn.cancer.2, nodes=c("C"))
## $C
## C
##         yes          no 
## 0.001261034 0.998738966

Aufgabe Koronare Herzkrankheit

# Load the bundled coronary artery disease (CAD) data set
data("cad1")
head(cad1)
##      Sex   AngPec        AMI QWave QWavecode    STcode STchange SuffHeartF
## 1   Male     None NotCertain    No    Usable    Usable       No         No
## 2   Male Atypical NotCertain    No    Usable    Usable       No         No
## 3 Female     None   Definite    No    Usable    Usable       No         No
## 4   Male     None NotCertain    No    Usable Nonusable       No         No
## 5   Male     None NotCertain    No    Usable Nonusable       No         No
## 6   Male     None NotCertain    No    Usable Nonusable       No         No
##   Hypertrophi Hyperchol Smoker Inherit Heartfail CAD
## 1          No        No     No      No        No  No
## 2          No        No     No      No        No  No
## 3          No        No     No      No        No  No
## 4          No        No     No      No        No  No
## 5          No        No     No      No        No  No
## 6          No        No     No      No        No  No
# Create the DAG: CAD with parents Smoker, Inherit, Hyperchol;
# AngPec, Heartfail and QWave each with parent CAD
dag.cad <- dag(~ CAD:Smoker:Inherit:Hyperchol + 
                 AngPec:CAD + 
                 Heartfail:CAD + 
                 QWave:CAD)
plot(dag.cad)

# smooth is a small positive number to avoid zero entries in the CPTs
# (cf. Additive smoothing, http://en.wikipedia.org/wiki/Additive_smoothing)
bn.cad <- grain(dag.cad, data = cad1, smooth = 0.1)
# Let's ask some questions...
# Conditional distribution P(CAD | Smoker), estimated from the data:
querygrain(bn.cad, nodes=c("CAD", "Smoker"), type="conditional") 
##      Smoker
## CAD          No       Yes
##   No  0.7172084 0.4933771
##   Yes 0.2827916 0.5066229